# Step 1 -> Read data into R workstation
# RCurl supplies getURL() so read.csv() can consume a remote file
library(RCurl)
url <- paste0(
  "https://raw.githubusercontent.com",
  "/analyticsbook/book/main/data/AD.csv"
)
AD <- read.csv(text = getURL(url))
# str(AD)
# Step 2 -> Data preprocessing
# Create your X matrix (predictors) and Y vector (outcome variable)
X <- AD[, 2:16]
Y <- AD$DX_bl
# The following code makes sure the variable "DX_bl" is a "factor".
# It denotes "0" as "c0" and "1" as "c1", to highlight the fact
# that "DX_bl" is a factor variable, not a numerical variable.
Y <- paste0("c", Y)
# as.factor is to convert any variable into the
# format as "factor" variable.
Y <- as.factor(Y)
# Then, we integrate everything into a data frame
data <- data.frame(X, Y)
# The outcome is the last column; using ncol(data) avoids the
# hard-coded index 16, and `<-` (not `=`) is the assignment idiom
names(data)[ncol(data)] <- "DX_bl"
set.seed(1) # generate the same random sequence
# Create a training data (half the original data size)
train.ix <- sample(nrow(data), floor(nrow(data) / 2))
data.train <- data[train.ix, ]
# Create a testing data (half the original data size)
data.test <- data[-train.ix, ]
# Step 3 -> Use glm() function to build a full model
# with all predictors
# family = "binomial" makes glm() fit a logistic regression;
# the formula DX_bl ~ . uses every other column of data.train
# as a predictor.
logit.AD.full <- glm(DX_bl~., data = data.train,
family = "binomial")
# The ## lines below record the console output produced when this
# script was run (coefficient table, deviances, AIC).
summary(logit.AD.full)
##
## Call:
## glm(formula = DX_bl ~ ., family = "binomial", data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4250 -0.3645 -0.0704 0.2074 3.1707
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 43.97098 7.83797 5.610 2.02e-08 ***
## AGE -0.07304 0.03875 -1.885 0.05945 .
## PTGENDER 0.48668 0.46682 1.043 0.29716
## PTEDUCAT -0.24907 0.08714 -2.858 0.00426 **
## FDG -3.28887 0.59927 -5.488 4.06e-08 ***
## AV45 2.09311 1.36020 1.539 0.12385
## HippoNV -38.03422 6.16738 -6.167 6.96e-10 ***
## e2_1 0.90115 0.85564 1.053 0.29225
## e4_1 0.56917 0.54502 1.044 0.29634
## rs3818361 -0.47249 0.45309 -1.043 0.29703
## rs744373 0.02681 0.44235 0.061 0.95166
## rs11136000 -0.31382 0.46274 -0.678 0.49766
## rs610932 0.55388 0.49832 1.112 0.26635
## rs3851179 -0.18635 0.44872 -0.415 0.67793
## rs3764650 -0.48152 0.54982 -0.876 0.38115
## rs3865444 0.74252 0.45761 1.623 0.10467
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 349.42 on 257 degrees of freedom
## Residual deviance: 139.58 on 242 degrees of freedom
## AIC: 171.58
##
## Number of Fisher Scoring iterations: 7
# Step 4 -> use step() to automatically delete
# all the insignificant
# variables
# Also means, automatic model selection
# step() performs stepwise selection in both directions ("both"),
# minimizing AIC; trace = 0 suppresses the per-step progress output.
logit.AD.reduced <- step(logit.AD.full, direction="both",
trace = 0)
summary(logit.AD.reduced)
##
## Call:
## glm(formula = DX_bl ~ AGE + PTEDUCAT + FDG + AV45 + HippoNV +
## rs3865444, family = "binomial", data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.38957 -0.42407 -0.09268 0.25092 2.73658
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 42.68795 7.07058 6.037 1.57e-09 ***
## AGE -0.07993 0.03650 -2.190 0.02853 *
## PTEDUCAT -0.22195 0.08242 -2.693 0.00708 **
## FDG -3.16994 0.55129 -5.750 8.92e-09 ***
## AV45 2.62670 1.18420 2.218 0.02655 *
## HippoNV -36.22215 5.53083 -6.549 5.79e-11 ***
## rs3865444 0.71373 0.44290 1.612 0.10707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 349.42 on 257 degrees of freedom
## Residual deviance: 144.62 on 251 degrees of freedom
## AIC: 158.62
##
## Number of Fisher Scoring iterations: 7
# Step 4 continued
# Likelihood ratio test comparing the reduced model (Model 1)
# against the full model (Model 2); the large p-value (0.8305)
# indicates the dropped predictors do not significantly improve fit,
# so the reduced model is adequate.
anova(logit.AD.reduced,logit.AD.full,test = "LRT")
## Analysis of Deviance Table
##
## Model 1: DX_bl ~ AGE + PTEDUCAT + FDG + AV45 + HippoNV + rs3865444
## Model 2: DX_bl ~ AGE + PTGENDER + PTEDUCAT + FDG + AV45 + HippoNV + e2_1 +
## e4_1 + rs3818361 + rs744373 + rs11136000 + rs610932 + rs3851179 +
## rs3764650 + rs3865444
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 251 144.62
## 2 242 139.58 9 5.0438 0.8305
# The argument, test = "LRT", means that the p-value
# is derived via the Likelihood Ratio Test (LRT).
# Step 5 -> test the significance of the logistic model
# Test residual deviance for lack-of-fit
# (if > 0.10, little-to-no lack-of-fit)
# lower.tail = FALSE computes the upper-tail probability directly,
# which is numerically more accurate than 1 - pchisq(...) when the
# p-value would be very small.
dev.p.val <- pchisq(logit.AD.reduced$deviance,
                    logit.AD.reduced$df.residual,
                    lower.tail = FALSE)
dev.p.val
## [1] 1
# Step 6 -> Predict on test data using your
# logistic regression model
# Note: without type = "response", predict() returns values on the
# link (linear predictor) scale, not probabilities, so the natural
# cut-off is 0 rather than 0.5.
y_hat <- predict(logit.AD.reduced, data.test)
# Step 7 -> Evaluate the prediction performance of
# your logistic regression model
# (1) Three main metrics for classification: Accuracy,
# Sensitivity (1 - False Negative Rate),
# Specificity (1 - False Positive Rate)
# Since y_hat here is the values from the linear equation
# part of the logistic regression model, by default,
# we should use 0 as a cut-off value (only by default,
# not optimal though): y_hat >= 0 is labeled class "c1",
# otherwise class "c0". ifelse() classifies every observation,
# including the boundary case y_hat == 0, which the original
# two-step which() assignment left unlabeled.
y_hat2 <- ifelse(y_hat >= 0, "c1", "c0")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# confusionMatrix() in the package "caret" is a powerful
# function to summarize the prediction performance of a
# classification model, reporting metrics such as Accuracy,
# Sensitivity (1 - False Negative Rate),
# Specificity (1 - False Positive Rate), to name a few.
library(e1071)
confusionMatrix(table(y_hat2, data.test$DX_bl))
## Confusion Matrix and Statistics
##
##
## y_hat2 c0 c1
## c0 117 29
## c1 16 97
##
## Accuracy : 0.8263
## 95% CI : (0.7745, 0.8704)
## No Information Rate : 0.5135
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6513
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8797
## Specificity : 0.7698
## Pos Pred Value : 0.8014
## Neg Pred Value : 0.8584
## Prevalence : 0.5135
## Detection Rate : 0.4517
## Detection Prevalence : 0.5637
## Balanced Accuracy : 0.8248
##
## 'Positive' Class : c0
##
# (2) ROC curve is another commonly reported metric for
# classification models
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# pROC has the roc() function that is very useful here;
# build the ROC object from the link-scale scores, then plot it
roc.curve <- roc(data.test$DX_bl, y_hat)
plot(roc.curve, col = "blue", main = "ROC Curve")
## Setting levels: control = c0, case = c1
## Setting direction: controls < cases

## coefficients and 95% CI (profile-likelihood intervals)
ci.mat <- confint(logit.AD.reduced)
cbind(coef = coef(logit.AD.reduced), ci.mat)
## Waiting for profiling to be done...
## coef 2.5 % 97.5 %
## (Intercept) 42.68794758 29.9745022 57.88659748
## AGE -0.07993473 -0.1547680 -0.01059348
## PTEDUCAT -0.22195425 -0.3905105 -0.06537066
## FDG -3.16994212 -4.3519800 -2.17636447
## AV45 2.62670085 0.3736259 5.04703489
## HippoNV -36.22214822 -48.1671093 -26.35100122
## rs3865444 0.71373441 -0.1348687 1.61273264
# Remark: how to obtain the 95% CI of the predictions
y_hat <- predict(logit.AD.reduced, data.test, type = "link",
                 se.fit = TRUE)
# se.fit = TRUE, is to get the standard error in the predictions,
# which is necessary information for us to construct
# the 95% CI of the predictions
data.test$fit <- y_hat$fit
data.test$se.fit <- y_hat$se.fit
# We can readily convert this information into the 95% CIs
# of the predictions (the way these 95% CIs are
# derived are again, only in approximated sense).
# CI for fitted values: apply the inverse-logit to the link-scale
# fit and its normal-approximation interval. plogis(x) computes
# exp(x) / (1 + exp(x)) exactly and is numerically stable for
# large |x|, unlike the explicit exp ratio.
data.test <- within(data.test, {
  fitted <- plogis(fit)
  fit.lower <- plogis(fit - 1.96 * se.fit)
  fit.upper <- plogis(fit + 1.96 * se.fit)
})
## odds ratios and 95% CI (exponentiated coefficients)
exp(cbind(OR = coef(logit.AD.reduced),
          confint(logit.AD.reduced)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 3.460510e+18 1.041744e+13 1.379844e+25
## AGE 9.231766e-01 8.566139e-01 9.894624e-01
## PTEDUCAT 8.009520e-01 6.767113e-01 9.367202e-01
## FDG 4.200603e-02 1.288128e-02 1.134532e-01
## AV45 1.382807e+01 1.452993e+00 1.555605e+02
## HippoNV 1.857466e-16 1.205842e-21 3.596711e-12
## rs3865444 2.041601e+00 8.738306e-01 5.016501e+00
# Fit a logistic regression model with FDG
# as the single predictor. Note this fit uses the full AD data set,
# not the training split used elsewhere in this script.
logit.AD.FDG <- glm(DX_bl ~ FDG, data = AD, family = "binomial")
# The ## lines below record the console output produced when this
# script was run.
summary(logit.AD.FDG)
##
## Call:
## glm(formula = DX_bl ~ FDG, family = "binomial", data = AD)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4686 -0.8166 -0.2758 0.7679 2.7812
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 18.3300 1.7676 10.37 <2e-16 ***
## FDG -2.9370 0.2798 -10.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 711.27 on 516 degrees of freedom
## Residual deviance: 499.00 on 515 degrees of freedom
## AIC: 503
##
## Number of Fisher Scoring iterations: 5
# Refit the single-predictor model on the training split, then
# plot predicted probabilities with point-wise 95% CIs on the
# test split.
logit.AD.FDG <- glm(DX_bl ~ FDG, data = data.train,
                    family = "binomial")
y_hat <- predict(logit.AD.FDG, data.test, type = "link",
                 se.fit = TRUE)
data.test$fit <- y_hat$fit
data.test$se.fit <- y_hat$se.fit
# CI for fitted values on the probability scale; plogis() is the
# inverse-logit, exp(x) / (1 + exp(x)), and is numerically stable
# for large |x|.
data.test <- within(data.test, {
  fitted <- plogis(fit)
  fit.lower <- plogis(fit - 1.96 * se.fit)
  fit.upper <- plogis(fit + 1.96 * se.fit)
})
library(ggplot2)
newData <- data.test[order(data.test$FDG), ]
# DX_bl is a factor with levels "c0" < "c1"; as.numeric() yields
# the level codes 1/2, so subtracting 1 recodes it to 0/1 in one
# step (the original used a three-step recode with a redundant
# second as.numeric() call).
newData$DX_bl <- as.numeric(newData$DX_bl) - 1
p <- ggplot(newData, aes(x = FDG, y = DX_bl))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = FDG, ymin = fit.lower,
                         ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = FDG, y = fitted), colour = "red")
# fitted values
p <- p + geom_point(aes(y = fitted), size = 2, colour = "red")
# observed values
p <- p + geom_point(size = 2)
p <- p + ylab("Probability") + theme(text = element_text(size = 18))
p <- p + labs(title =
  "Observed and predicted probability of disease")
print(p)

# install.packages("reshape2")
# require() is kept here (rather than library()) so the recorded
# loading message below matches; library() is generally preferred
# for hard dependencies.
require(reshape2)
## Loading required package: reshape2
# Row id used as melt's id variable; seq_len(nrow(...)) replaces
# the original c(1:dim(...)[1]) idiom (same values, safer form).
data.train$ID <- seq_len(nrow(data.train))
# Reshape selected columns (AGE, PTEDUCAT, FDG, AV45, HippoNV,
# DX_bl, ID by position) to long format, keyed by ID and DX_bl
AD.long <- melt(data.train[, c(1, 3, 4, 5, 6, 16, 17)],
                id.vars = c("ID", "DX_bl"))
# Plot the data using ggplot
require(ggplot2)
p <- ggplot(AD.long, aes(x = factor(DX_bl), y = value))
# boxplot, size=.75 to stand out behind CI
p <- p + geom_boxplot(size = 0.75, alpha = 0.5)
# points for observed data
p <- p + geom_point(position = position_jitter(w = 0.05, h = 0),
                    alpha = 0.1)
# diamond at mean for each group
p <- p + stat_summary(fun = mean, geom = "point", shape = 18,
                      size = 6, alpha = 0.75, colour = "red")
# confidence limits based on normal distribution
p <- p + stat_summary(fun.data = "mean_cl_normal",
                      geom = "errorbar", width = .2, alpha = 0.8)
p <- p + facet_wrap(~ variable, scales = "free_y", ncol = 3)
p <- p + labs(title =
  "Boxplots of variables by diagnosis (0 - normal; 1 - patient)")
print(p)

# Load the packages used by the process-monitoring example below:
# dplyr/tidyr for data manipulation, randomForest for the
# classifier, ggplot2 for plotting. The ## lines record the
# start-up / masking messages produced at the time this script
# was run.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following object is masked from 'package:RCurl':
##
## complete
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
# Use a larger base font size for all subsequent ggplot figures
theme_set(theme_gray(base_size = 15) )
# define monitoring function. data0: reference data;
# data.real.time: real-time data; wsz: window size
# Returns a list with one row per monitored data point:
#   importance.mat - random forest variable importance scores
#   stat.mat       - monitoring statistics per point: OOB error
#                    for each class and the mean class-1 vote for
#                    the windowed real-time rows
Monitoring <- function( data0, data.real.time, wsz ){
num.data.points <- nrow(data.real.time)
stat.mat <- NULL
importance.mat <- NULL
# fit one classifier per incoming real-time data point
for( i in 1:num.data.points ){
# at the start of monitoring, when real-time data size is
# smaller than the window size, combine the real-time
# data points and random samples from the reference data
# to form a data set of wsz
if(i<wsz){
# sample.size.from.reference (ssfr)
ssfr <- wsz - i
sample.reference <- data0[sample(nrow(data0),
ssfr,replace = TRUE), ]
current.real.time.data <- rbind(sample.reference,
data.real.time[1:i,,drop=FALSE])
}else{
# otherwise use a sliding window of the wsz most recent points
current.real.time.data <- data.real.time[(i-wsz+
1):i,,drop=FALSE]
}
# label the whole window as class 1; the callers assign class 0
# to data0, and any reference rows mixed into the window are
# deliberately relabeled 1 here
current.real.time.data$class <- 1
data <- rbind( data0, current.real.time.data )
# rename predictors X1..Xk and the label column "Class"
colnames(data) <- c(paste0("X",1:(ncol(data)-1)),
"Class")
data$Class <- as.factor(data$Class)
# apply random forests to the data; sampsize = c(wsz, wsz)
# draws a balanced bootstrap of wsz cases per class
my.rf <- randomForest(Class ~ .,sampsize=c(wsz,wsz), data=data)
# get importance score
importance.mat <- rbind(importance.mat, t(my.rf$importance))
# get monitoring statistics
# NOTE(review): the next five lines access randomForest object
# components by position (my.rf[5], my.rf[6]) and reshape with
# matrix(..., 2:3), even though matrix() expects a scalar nrow.
# This appears intended to extract the per-class OOB error rates
# (err.rate) and the OOB votes, but it is fragile and depends on
# the package's internal component order -- confirm against the
# randomForest object structure (my.rf$err.rate / my.rf$votes
# would be the robust equivalents).
ooblist <- my.rf[5]
oobcolumn=matrix(c(ooblist[[1]]),2:3)
ooberrornormal= (oobcolumn[,3])[1]
ooberrorabnormal=(oobcolumn[,3])[2]
temp=my.rf[6]
# mean vote for class 1 among the real-time (window) rows only;
# rows 1..nrow(data0) are the reference data
p1vote <- mean(temp$votes[,2][(nrow(data0)+1):nrow(data)])
this.stat <- c(ooberrornormal,ooberrorabnormal,p1vote)
stat.mat <- rbind(stat.mat, this.stat)
}
result <- list(importance.mat = importance.mat,
stat.mat = stat.mat)
return(result)
}
# data generation
# sizes of reference data, real-time data without change,
# and real-time data with changes
length0 <- 100
length1 <- 100
length2 <- 100
# 2-dimension
dimension <- 2
# reference data
data0 <- rnorm(dimension * length0, mean = 0, sd = 1)
# real-time data with no change (sized by length1; the original
# used length2 here, which only worked because the two sizes are
# equal)
data1 <- rnorm(dimension * length1, mean = 0, sd = 1)
# real-time data different from the reference data in the
# second variable (mean shifted from 0 to 2), sized by length2
data2 <- cbind(V1 = rnorm(1 * length2, mean = 0, sd = 1),
               V2 = rnorm(1 * length2, mean = 2, sd = 1))
# convert to data frame (byrow = TRUE: one observation per row)
data0 <- matrix(data0, nrow = length0, byrow = TRUE) %>%
  as.data.frame()
data1 <- matrix(data1, nrow = length1, byrow = TRUE) %>%
  as.data.frame()
data2 <- data2 %>% as.data.frame()
# assign variable names
colnames(data0) <- paste0("X", 1:ncol(data0))
colnames(data1) <- paste0("X", 1:ncol(data1))
colnames(data2) <- paste0("X", 1:ncol(data2))
# assign reference data with class 0 and real-time data with class 1
data0 <- data0 %>% mutate(class = 0)
data1 <- data1 %>% mutate(class = 1)
data2 <- data2 %>% mutate(class = 1)
# real-time data consists of normal data and abnormal data
data.real.time <- rbind(data1, data2)
# visualize reference vs. unchanged real-time data
data.plot <- rbind(data0, data1) %>% mutate(class = factor(class))
ggplot(data.plot, aes(x = X1, y = X2, shape = class,
                      color = class)) + geom_point(size = 3)

# visualize reference vs. mean-shifted real-time data
data.plot <- rbind(data0, data2) %>% mutate(class = factor(class))
ggplot(data.plot, aes(x = X1, y = X2, shape = class,
                      color = class)) + geom_point(size = 3)

# Run the monitoring procedure with a window of 10 points
wsz <- 10
result <- Monitoring( data0, data.real.time, wsz )
stat.mat <- result$stat.mat
importance.mat <- result$importance.mat
# plot different monitor statistics
stat.mat <- data.frame(stat.mat)
stat.mat$id <- 1:nrow(stat.mat)
colnames(stat.mat) <- c("error0","error1","prob","id")
stat.mat <- stat.mat %>% gather(type, statistics, error0,
                                error1, prob)
# one point layer only (the original accidentally called
# geom_point() twice, drawing every point in two layers)
ggplot(stat.mat, aes(x = id, y = statistics, color = type)) +
  geom_line(linetype = "dashed") +
  geom_point(size = 2)

# plot importance scores for diagnosis
importance.mat <- data.frame(importance.mat)
importance.mat$id <- 1:nrow(importance.mat)
colnames(importance.mat) <- c("X1","X2","id")
importance.mat <- importance.mat %>%
  gather(variable, importance, X1, X2)
ggplot(importance.mat, aes(x = id, y = importance,
                           color = variable)) +
  geom_line(linetype = "dashed") +
  geom_point(size = 2)

# 10-dimensions, with 2 variables being changed from
# the normal condition
dimension <- 10
wsz <- 5
# reference data
data0 <- rnorm( dimension * length0, mean = 0, sd = 1)
# real-time data with no change
data1 <- rnorm( dimension * length1, mean = 0, sd = 1)
# real-time data different from the reference data in the
# last two variables (mean shifted from 0 to 20)
data2 <- c( rnorm( (dimension - 2) * length2, mean = 0, sd = 1),
            rnorm( (2) * length2, mean = 20, sd = 1))
# convert to data frame
data0 <- matrix(data0, nrow = length0, byrow = TRUE) %>%
  as.data.frame()
data1 <- matrix(data1, nrow = length1, byrow = TRUE) %>%
  as.data.frame()
# byrow = FALSE fills column-major, so the mean-20 draws land in
# the last two columns
data2 <- matrix(data2, ncol = 10, byrow = FALSE) %>%
  as.data.frame()
# assign reference data with class 0 and real-time data
# with class 1
data0 <- data0 %>% mutate(class = 0)
data1 <- data1 %>% mutate(class = 1)
data2 <- data2 %>% mutate(class = 1)
# real-time data consists of normal data and abnormal data
data.real.time <- rbind(data1, data2)
result <- Monitoring( data0, data.real.time, wsz )
stat.mat <- result$stat.mat
importance.mat <- result$importance.mat
# plot different monitor statistics
stat.mat <- data.frame(stat.mat)
stat.mat$id <- 1:nrow(stat.mat)
colnames(stat.mat) <- c("error0","error1","prob","id")
stat.mat <- stat.mat %>% gather(type, statistics, error0,
                                error1, prob)
# one point layer only (the original accidentally called
# geom_point() twice, drawing every point in two layers)
ggplot(stat.mat, aes(x = id, y = statistics, color = type)) +
  geom_line(linetype = "dashed") +
  geom_point(size = 2)

# plot importance scores for diagnosis
importance.mat <- data.frame(importance.mat)
importance.mat$id <- seq_len(nrow(importance.mat))
# colnames(importance.mat) <- c("X1","X2","id")
# reshape to long format: one row per (time id, variable)
importance.mat <- importance.mat %>%
  gather(variable, importance, X1:X10)
# order the factor levels X1..X10 numerically so the legend is
# not sorted alphabetically (X1, X10, X2, ...)
importance.mat$variable <- factor(importance.mat$variable,
                                  levels = paste0("X", 1:10))
# levels(importance.mat$variable) <- paste0( "X", 1:10 )
ggplot(importance.mat,
       aes(x = id, y = importance, color = variable)) +
  geom_line(linetype = "dashed") +
  geom_point(size = 2)

# Create the frequency table in accordance of categorization
# of HippoNV: cut HippoNV at its 5%, 10%, ..., 95% quantiles,
# giving 20 roughly equal-frequency bins
temp <- quantile(AD$HippoNV, seq(from = 0.05, to = 0.95,
                                 by = 0.05))
AD$HippoNV.category <- cut(AD$HippoNV, breaks = c(-Inf,
                                                  temp, Inf))
tempData <- data.frame(xtabs(~DX_bl + HippoNV.category,
                             data = AD))
# keep only the DX_bl == 1 rows (every second row of the table)
tempData <- tempData[seq(from = 2, to =
  2 * length(unique(AD$HippoNV.category)),
  by = 2), ]
summary(xtabs(~DX_bl + HippoNV.category, data = AD))
## Call: xtabs(formula = ~DX_bl + HippoNV.category, data = AD)
## Number of cases in table: 517
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 229.23, df = 19, p-value = 4.822e-38
tempData$Total <- colSums(as.matrix(xtabs(~DX_bl +
  HippoNV.category, data = AD)))
# empirical probability of being normal within each bin
tempData$p.hat <- 1 - tempData$Freq / tempData$Total
tempData$HippoNV.category <- as.numeric(tempData$HippoNV.category)
str(tempData)
## 'data.frame': 20 obs. of 5 variables:
## $ DX_bl : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ HippoNV.category: num 1 2 3 4 5 6 7 8 9 10 ...
## $ Freq : int 24 25 25 21 22 15 17 17 19 11 ...
## $ Total : num 26 26 26 26 26 25 26 26 26 34 ...
## $ p.hat : num 0.0769 0.0385 0.0385 0.1923 0.1538 ...
# Draw the scatterplot of HippoNV.category
# versus the probability of normal
library(ggplot2)
p <- ggplot(tempData, aes(x = HippoNV.category, y = p.hat))
p <- p + geom_point(size = 3)
p <- p + geom_smooth(method = "loess")
# labs() takes the axis label as x = ...; the original passed
# xlab = ..., which is not a labs() argument and was silently
# ignored, so the x-axis label never appeared
p <- p + labs(title = "Empirically observed probability of normal",
              x = "HippoNV")
print(p)
## `geom_smooth()` using formula 'y ~ x'

require(rpart)
## Loading required package: rpart
# Simulate 2000 points uniformly on the unit square. The label Y
# is the sign of 0.5 * (X1 - X2); labels for points within 0.05 of
# the decision boundary are randomized to inject class noise.
ndata <- 2000
X1 <- runif(ndata, min = 0, max = 1)
X2 <- runif(ndata, min = 0, max = 1)
data <- data.frame(X1, X2)
data <- data %>%
  mutate(X12 = 0.5 * (X1 - X2),
         Y = ifelse(X12 >= 0, 1, 0))
# randomize labels near the boundary
ix <- which(abs(data$X12) <= 0.05)
data$Y[ix] <- ifelse(runif(length(ix)) < 0.5, 0, 1)
# drop the helper column and make Y a factor
data <- data %>%
  select(-X12) %>%
  mutate(Y = as.factor(as.character(Y)))
ggplot(data, aes(x = X1, y = X2, color = Y)) + geom_point()

# Fit a logistic regression and a decision tree on the simulated
# data, then compare their predicted class regions visually.
linear_model <- glm(Y ~ ., family = binomial(link = "logit"),
                    data = data)
tree_model <- rpart( Y ~ ., data = data)
pred_linear <- predict(linear_model, data, type = "response")
# column 1 of the rpart probability matrix is P(class "0")
pred_tree <- predict(tree_model, data, type = "prob")[, 1]
# Label each observation with the class each model predicts, using
# a 0.5 probability cut-off. Because pred_tree is P(class "0"),
# values below 0.5 map to class 1.
data_pred <- data %>%
  mutate(pred_linear_class =
           as.factor(as.character(ifelse(pred_linear < 0.5, 0, 1)))) %>%
  mutate(pred_tree_class =
           as.factor(as.character(ifelse(pred_tree < 0.5, 0, 1))))
ggplot(data_pred, aes(x = X1, y = X2, color = pred_linear_class)) +
  geom_point()

ggplot(data_pred, aes(x = X1, y = X2, color = pred_tree_class)) +
  geom_point()

# AGE, PTGENDER and PTEDUCAT are used as the
# predictor variables.
# MMSCORE (a numeric value) is the outcome.
# Step 1: read data into R
url <- paste0("https://raw.githubusercontent.com",
              "/analyticsbook/book/main/data/AD.csv")
AD <- read.csv(text = getURL(url))
# Step 2: data preprocessing
X <- AD[, 2:16]
Y <- AD$MMSCORE
data <- data.frame(X, Y)
# The outcome is the last column; ncol(data) avoids a hard-coded
# index, and the needless c() wrapper is dropped
names(data)[ncol(data)] <- "MMSCORE"
# Create a training data (half the original data size)
train.ix <- sample(nrow(data), floor(nrow(data) / 2))
data.train <- data[train.ix, ]
# Create a testing data (half the original data size)
data.test <- data[-train.ix, ]
# Step 3: build the tree
# for regression problems, use method="anova"
tree_reg <- rpart( MMSCORE ~ ., data.train, method = "anova")
# Step 4: draw the tree
require(rpart.plot)
## Loading required package: rpart.plot
prp(tree_reg, nn.cex = 1)

# Step 5 -> prune the tree (drop splits below complexity 0.03)
tree_reg <- prune(tree_reg, cp = 0.03)
prp(tree_reg, nn.cex = 1)

# Step 6 -> Predict using your tree model
pred.tree <- predict(tree_reg, data.test)
cor(pred.tree, data.test$MMSCORE)
## [1] 0.3182434
# For regression model, you can use correlation
# to measure how close are your predictions
# with the true outcome values of the data points